In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
from impala.dbapi import connect
from impala.util import as_pandas
In [ ]:
# connect to impala
conn = connect(host="mycluster.domain.com", port=my_impala_port_number)
# Make sure we"re pulling from the right location
cur = conn.cursor()
cur.execute("use my_db")
In [ ]:
# Need to select time range:
# Try Aug 1-2, 2016.
# Aug 1 - starttimebox = 1470009600000
# Aug 2 - endtimebox = 1470096000000
# Aug 5 = 1470355200000
In [ ]:
# Following query returns averages of SpO2 & RR values for each encounterid.
# Needs to be reshaped into columnar format.
query_RRT = """
SELECT
ce.encntr_id
, ce.event_cd
, cv_event_cd.description AS event_description
, AVG(CAST(ce.result_val as int)) as avg
FROM clinical_event ce
LEFT OUTER JOIN code_value cv_event_cd
ON ce.event_cd = cv_event_cd.code_value
WHERE ce.encntr_id IN (
SELECT DISTINCT encntr_id
FROM clinical_event
WHERE event_cd = '54411998'
AND result_status_cd NOT IN ('31', '36')
AND valid_until_dt_tm > unix_timestamp()
AND event_class_cd not in ('654645')
)
AND ce.event_cd IN ('3623994', '703540')
AND ce.performed_dt_tm > 1470009600000
AND ce.performed_dt_tm < 1470096000000
GROUP BY ce.encntr_id, ce.event_cd, cv_event_cd.description
ORDER BY ce.encntr_id;
"""
In [ ]:
cur.execute(query_RRT)
df_RRT = as_pandas(cur)
In [ ]:
df_RRT.head()
In [ ]:
df_RRT.dtypes
In [ ]:
# O2 (oxygen)
df_O2RRT = df_RRT[df_RRT.event_cd=='3623994']
# Respiration Rate
df_RRRRT = df_RRT[df_RRT.event_cd=='703540']
In [ ]:
df_O2RRT.head()
In [ ]:
df_RRRRT.head()
In [ ]:
df_finRRT = pd.merge(df_O2RRT, df_RRRRT, on='encntr_id')
In [ ]:
df_finRRT.head()
In [ ]:
# Following query returns averages of SpO2 & RR values for each encounterid.
# Needs to be reshaped into columnar format.
query_NotRRT = """
SELECT
ce.encntr_id
, ce.event_cd
, cv_event_cd.description AS event_description
, AVG(CAST(ce.result_val as int)) as avg
FROM clinical_event ce
LEFT OUTER JOIN code_value cv_event_cd
ON ce.event_cd = cv_event_cd.code_value
WHERE ce.encntr_id NOT IN (
SELECT DISTINCT encntr_id
FROM clinical_event
WHERE event_cd = '54411998'
AND result_status_cd NOT IN ('31', '36')
AND valid_until_dt_tm > unix_timestamp()
AND event_class_cd not in ('654645')
)
AND ce.event_cd IN ('3623994', '703540')
AND ce.performed_dt_tm > 1470009600000
AND ce.performed_dt_tm < 1470096000000
GROUP BY ce.encntr_id, ce.event_cd, cv_event_cd.description
ORDER BY ce.encntr_id;
"""
In [ ]:
cur.execute(query_NotRRT)
df_NotRRT = as_pandas(cur)
In [ ]:
df_NotRRT.shape
In [ ]:
df_O2NotRRT = df_NotRRT[df_NotRRT.event_cd=='3623994']
df_RRNotRRT = df_NotRRT[df_NotRRT.event_cd=='703540']
In [ ]:
df_finNotRRT = pd.merge(df_O2NotRRT, df_RRNotRRT, on='encntr_id')
In [ ]:
df_finNotRRT.head()
In [ ]:
plt.figure(figsize=(10, 8))
plt.scatter(df_finNotRRT.avg_x, df_finNotRRT.avg_y, s = 30, alpha=0.5, c='blue')
plt.scatter(df_finRRT.avg_x, df_finRRT.avg_y, s = 30, alpha=0.5, c='red')
In [ ]:
# A helper that takes two event codes and makes a pair plot will be useful
def scatter_avgs(event_cd1, event_cd2):
'''
Input: event codes are strings.
This function returns a figure, of the average of the vital signs per encounter
'''
query_RRT = """
SELECT
ce.encntr_id
, ce.event_cd
, cv_event_cd.description AS event_description
, AVG(CAST(ce.result_val as int)) as avg
FROM clinical_event ce
JOIN encounter enc ON ce.encntr_id = enc.encntr_id
LEFT OUTER JOIN code_value cv_event_cd
ON ce.event_cd = cv_event_cd.code_value
WHERE ce.encntr_id IN (
SELECT DISTINCT encntr_id
FROM clinical_event
WHERE event_cd = '54411998'
AND result_status_cd NOT IN ('31', '36')
AND valid_until_dt_tm > unix_timestamp()
AND event_class_cd not in ('654645')
)
AND ce.event_cd IN ('{0}', '{1}')
AND ce.performed_dt_tm > 1470009600000
AND ce.performed_dt_tm < 1470355200000
AND enc.loc_facility_cd = '633867'
GROUP BY ce.encntr_id, ce.event_cd, cv_event_cd.description
ORDER BY ce.encntr_id;
""".format(str(event_cd1), str(event_cd2))
cur.execute(query_RRT)
df_RRT = as_pandas(cur)
df_1RRT = df_RRT[df_RRT.event_cd==str(event_cd1)]
df_2RRT = df_RRT[df_RRT.event_cd==str(event_cd2)]
df_finRRT = pd.merge(df_1RRT, df_2RRT, on='encntr_id')
query_NotRRT = """
SELECT
ce.encntr_id
, ce.event_cd
, cv_event_cd.description AS event_description
, AVG(CAST(ce.result_val as int)) as avg
FROM clinical_event ce
JOIN encounter enc ON ce.encntr_id = enc.encntr_id
LEFT OUTER JOIN code_value cv_event_cd
ON ce.event_cd = cv_event_cd.code_value
WHERE ce.encntr_id NOT IN (
SELECT DISTINCT encntr_id
FROM clinical_event
WHERE event_cd = '54411998'
AND result_status_cd NOT IN ('31', '36')
AND valid_until_dt_tm > unix_timestamp()
AND event_class_cd not in ('654645')
)
AND ce.event_cd IN ('{0}', '{1}')
AND ce.performed_dt_tm > 1470009600000
AND ce.performed_dt_tm < 1470355200000
AND enc.loc_facility_cd = '633867'
GROUP BY ce.encntr_id, ce.event_cd, cv_event_cd.description
ORDER BY ce.encntr_id;
""".format(str(event_cd1), str(event_cd2))
cur.execute(query_NotRRT)
df_NotRRT = as_pandas(cur)
df_1NotRRT = df_NotRRT[df_NotRRT.event_cd==str(event_cd1)]
df_2NotRRT = df_NotRRT[df_NotRRT.event_cd==str(event_cd2)]
df_finNotRRT = pd.merge(df_1NotRRT, df_2NotRRT, on='encntr_id')
# plotting
plt.figure(figsize=(10, 8))
plt.scatter(df_finNotRRT.avg_x, df_finNotRRT.avg_y, s = 40, alpha=0.5, c='blue')
plt.scatter(df_finRRT.avg_x, df_finRRT.avg_y, s = 30, alpha=0.5, c='red')
plt.xlabel(event_cd1)
plt.ylabel(event_cd2)
plt.legend(['Non-RRT patients', 'RRT patients'])
plt.title('Vital sign averages per encounter')
In [ ]:
scatter_avgs('703501','703516') # x = systolic BP; y = diastolic BP
In [ ]:
scatter_avgs('703558','703306') # x= oral temp, y = mean arterial pressure
In [ ]:
scatter_avgs('703565', '4686698') # glasgow coma score, RASS score
In [ ]:
scatter_avgs('4674677', '2700653') # weight, height
In [ ]:
scatter_avgs('703511', '703540') # pulse rate, respiratory rate
plt.xlabel('pulse rate', fontsize = 16)
plt.ylabel('respiratory rate', fontsize=16)
In [ ]:
scatter_avgs('703540', '4690633') # respiratory rate, CO2
plt.xlabel('Respiratory Rate', fontsize=16)
plt.ylabel('')
In [ ]:
scatter_avgs('703540', '2700541') # respiratory rate, heart rate
plt.xlabel('Respiratory Rate', fontsize=16)
plt.ylabel('Heart Rate', fontsize = 16)
In [ ]:
scatter_avgs('3623994', '4690633') # O2, CO2
plt.xlabel('O2', fontsize = 16)
plt.ylabel('CO2', fontsize = 16)
plt.xlim([86, 101])
In [ ]:
## Earlier assumption of averaging out the data --> correct,
## no difference once we look at the avgerage vital signs over time.